In [24]:
import xml.etree.ElementTree as ET # Use cElementTree or lxml if too slow
OSM_FILE = "/Users/yangrenqin/udacity/P3/san-francisco.osm" # Replace this with your osm file
SAMPLE_FILE = "/Users/yangrenqin/udacity/P3/sample1.osm"
k = 30 # Parameter: take every k-th top level element
def get_element(osm_file, tags=('node', 'way', 'relation')):
context = iter(ET.iterparse(osm_file, events=('start', 'end')))
_, root = next(context)
for event, elem in context:
if event == 'end' and elem.tag in tags:
yield elem
root.clear()
with open(SAMPLE_FILE, 'w') as output:
output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
output.write('<osm>\n ')
# Write every kth top level element
for i, element in enumerate(get_element(OSM_FILE)):
if i % k == 0:
output.write(ET.tostring(element, encoding='unicode'))
output.write('</osm>')
In [10]:
from collections import defaultdict
filename='/Users/yangrenqin/udacity/P3/san-francisco.osm'
def count_tags(filename):
tags=defaultdict(int)
for _,elem in (ET.iterparse(filename)):
tags[elem.tag] += 1
return tags
count_tags(filename)
Out[10]:
In [31]:
import re
filename='/Users/yangrenqin/udacity/P3/san-francisco.osm'
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
lower_colons=re.compile(r'^([a-z]|_)*(:([a-z]|_)*)+$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
def key_type(element, keys,other):
if element.tag == "tag":
a=lower.search(element.attrib['k'])
b=lower_colon.search(element.attrib['k'])
c=problemchars.search(element.attrib['k'])
d=lower_colons.search(element.attrib['k'])
if a:
keys['lower'] += 1
elif b:
keys['lower_colon'] += 1
elif c:
keys['problemchars'] += 1
elif d:
keys['lower_colons'] += 1
else:
keys['other'] += 1
other.append(element.attrib['k'])
return keys,other
def process_map(filename):
keys = {"lower": 0, "lower_colon": 0, "lower_colons":0, "problemchars": 0, "other": 0}
other=[]
for _, element in ET.iterparse(filename):
keys,other = key_type(element, keys,other)
return keys,other
keys,others=process_map(filename)
print(keys)
In [11]:
import xml.etree.ElementTree as ET
from collections import defaultdict
import re
filename='/Users/yangrenqin/udacity/P3/san-francisco.osm'
expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road",
"Trail", "Parkway", "Commons", "Way", "Highway", "Path", "Terrace", "Alley", "Center",
"Circle", "Plaza", "Real"]
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
street_types=defaultdict(set)
def audit_street_type(street_types, street_name):
m = street_type_re.search(street_name)
if m:
street_type = m.group()
if street_type not in expected:
street_types[street_type].add(street_name)
def is_street_name(elem):
return (elem.attrib['k'] == "addr:street")
def audit(osmfile):
osm_file = open(osmfile, "r")
for event, elem in ET.iterparse(osm_file, events=("start",)):
if elem.tag == "node" or elem.tag == "way":
for tag in elem.iter("tag"):
if is_street_name(tag):
audit_street_type(street_types, tag.attrib['v'])
osm_file.close()
return street_types
error_street_type=audit(filename)
error_street_type
Out[11]:
In [207]:
street_mapping = { "St": "Street",
"St.": "Street",
"Steet": "Street",
"st": "Street",
"street": "Street",
"Ave": "Avenue",
"Ave.": "Avenue",
"ave": "Avenue",
"avenue": "Avenue",
"Rd.": "Road",
"Rd": "Road",
"Blvd": "Boulevard",
"Blvd,": "Boulevard",
"Blvd.": "Boulevard",
"Boulavard": "Boulevard",
"Boulvard": "Boulevard",
"Dr": "Drive",
"Dr.": "Drive",
"Pl": "Plaza",
"Plz": "Plaza",
"square": "Square"
}
postcode_mapping={"CA 94030": "94030",
"CA 94133": "94133",
"CA 94544": "94544",
"CA 94103": "94103",
"CA:94103": "94103"
}
error_postcode={'1087', '515', 'CA'}
cityname_mapping={"Berkeley, CA": "Berkeley",
"Fremont ": "Fremont",
"Oakland, CA": "Oakland",
"Oakland, Ca": "Oakland",
"San Francisco, CA": "San Francisco",
"San Francisco, CA 94102": "San Francisco",
"San Francicsco": "San Francisco",
"San Fransisco": "San Francisco",
"San Francsico": "San Francisco",
"Artherton": "Atherton"
}
error_cityname={'155', '157'}
In [14]:
filename='/Users/yangrenqin/udacity/P3/san-francisco.osm'
def is_street_postcode(elem):
return (elem.attrib['k'] == "addr:postcode")
def audit(osmfile):
osm_file = open(osmfile, "r")
postcode_types = set()
for event, elem in ET.iterparse(osm_file, events=("start",)):
if elem.tag == "node" or elem.tag == "way":
for tag in elem.iter("tag"):
if is_street_postcode(tag):
postcode_types.add(tag.attrib['v'])
osm_file.close()
return postcode_types
postcode=audit(filename)
postcode
Out[14]:
In [15]:
filename='/Users/yangrenqin/udacity/P3/san-francisco.osm'
def is_city(elem):
return (elem.attrib['k'] == "addr:city")
def audit(osmfile):
osm_file = open(osmfile, "r")
city_types = set()
for event, elem in ET.iterparse(osm_file, events=("start",)):
if elem.tag == "node" or elem.tag == "way":
for tag in elem.iter("tag"):
if is_city(tag):
city_types.add(tag.attrib['v'])
osm_file.close()
return city_types
cityname=audit(filename)
cityname
Out[15]:
In [218]:
import csv
import codecs
import re
import xml.etree.cElementTree as ET
OSM_PATH = "/Users/yangrenqin/udacity/P3/san-francisco.osm"
NODES_PATH = "/Users/yangrenqin/udacity/P3/nodes.csv"
NODE_TAGS_PATH = "/Users/yangrenqin/udacity/P3/nodes_tags.csv"
WAYS_PATH = "/Users/yangrenqin/udacity/P3/ways.csv"
WAY_NODES_PATH = "/Users/yangrenqin/udacity/P3/ways_nodes.csv"
WAY_TAGS_PATH = "/Users/yangrenqin/udacity/P3/ways_tags.csv"
LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
# Make sure the fields order in the csvs matches the column order in the sql table schema
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']
def capitalize(a):
b=''
for i in a.split(' '):
i=i.capitalize()+' '
b=b+i
a=b.strip()
return a
def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
problem_chars=PROBLEMCHARS, default_tag_type='regular'):
node_attribs = {}
way_attribs = {}
way_nodes = []
tags = [] # Handle secondary tags the same way for both node and way elements
if element.tag == 'node':
for i in element.attrib:
if i in node_attr_fields:
node_attribs[i]=element.attrib[i]
if element.getchildren() == []:
pass
else:
for tag in element.iter('tag'):
node_tags={}
k=tag.attrib['k']
if PROBLEMCHARS.search(k):
continue
elif LOWER_COLON.search(k):
if LOWER_COLON.search(k).group() == k:
if k == "addr:street":
m = street_type_re.search(tag.attrib['v'])
if m:
street_type = m.group()
if street_type in error_street_type:
if street_type in street_mapping:
tag.attrib['v']=tag.attrib['v'].replace(street_type,street_mapping[street_type])
else:
continue
node_tags['key']=k.split(':')[1]
node_tags['type']=k.split(':')[0]
node_tags['id']=element.attrib['id']
node_tags['value']=tag.attrib['v']
tags.append(node_tags)
else:
continue
if k == "addr:postcode":
if tag.attrib['v'] in error_postcode:
continue
else:
if tag.attrib['v'] in postcode_mapping:
tag.attrib['v']=postcode_mapping[tag.attrib['v']]
node_tags['key']=k.split(':')[1]
node_tags['type']=k.split(':')[0]
node_tags['id']=element.attrib['id']
node_tags['value']=tag.attrib['v']
tags.append(node_tags)
if k == "addr:city":
if tag.attrib['v'] in error_cityname:
continue
else:
if tag.attrib['v'] in cityname_mapping:
tag.attrib['v']=cityname_mapping[tag.attrib['v']]
node_tags['key']=k.split(':')[1]
node_tags['type']=k.split(':')[0]
node_tags['id']=element.attrib['id']
node_tags['value']=capitalize(tag.attrib['v'])
tags.append(node_tags)
else:
node_tags['key']=k.partition(':')[-1]
node_tags['type']=k.partition(':')[0]
node_tags['id']=element.attrib['id']
node_tags['value']=tag.attrib['v']
tags.append(node_tags)
else:
node_tags['id']=element.attrib['id']
node_tags['value']=tag.attrib['v']
node_tags['key']=k
node_tags['type']=default_tag_type
tags.append(node_tags)
if element.tag == 'way':
for i in element.attrib:
if i in way_attr_fields:
way_attribs[i]=element.attrib[i]
if element.getchildren() == []:
pass
else:
for tag in element.iter('tag'):
way_tags={}
k=tag.attrib['k']
if PROBLEMCHARS.search(k):
continue
elif LOWER_COLON.search(k):
if LOWER_COLON.search(k).group() == k:
if k == "addr:street":
m = street_type_re.search(tag.attrib['v'])
if m:
street_type = m.group()
if street_type in error_street_type:
if street_type in street_mapping:
tag.attrib['v']=tag.attrib['v'].replace(street_type,street_mapping[street_type])
else:
continue
way_tags['key']=k.split(':')[1]
way_tags['type']=k.split(':')[0]
way_tags['id']=element.attrib['id']
way_tags['value']=tag.attrib['v']
tags.append(way_tags)
else:
continue
if k == "addr:postcode":
if tag.attrib['v'] in error_postcode:
continue
else:
if tag.attrib['v'] in postcode_mapping:
tag.attrib['v']=postcode_mapping[tag.attrib['v']]
way_tags['key']=k.split(':')[1]
way_tags['type']=k.split(':')[0]
way_tags['id']=element.attrib['id']
way_tags['value']=tag.attrib['v']
tags.append(way_tags)
if k == "addr:city":
if tag.attrib['v'] in error_cityname:
continue
else:
if tag.attrib['v'] in cityname_mapping:
tag.attrib['v']=cityname_mapping[tag.attrib['v']]
way_tags['key']=k.split(':')[1]
way_tags['type']=k.split(':')[0]
way_tags['id']=element.attrib['id']
way_tags['value']=capitalize(tag.attrib['v'])
tags.append(way_tags)
else:
way_tags['key']=k.partition(':')[-1]
way_tags['type']=k.partition(':')[0]
way_tags['id']=element.attrib['id']
way_tags['value']=tag.attrib['v']
tags.append(way_tags)
else:
way_tags['id']=element.attrib['id']
way_tags['value']=tag.attrib['v']
way_tags['key']=k
way_tags['type']=default_tag_type
tags.append(way_tags)
for i,nd in enumerate(element.iter('nd')):
way_nd={}
way_nd['id']=element.attrib['id']
way_nd['node_id']=nd.attrib['ref']
way_nd['position']=i
way_nodes.append(way_nd)
if element.tag == 'node':
if element.getchildren() == []:
return {'node': node_attribs}
else:
return {'node': node_attribs, 'node_tags': tags}
elif element.tag == 'way':
if element.getchildren() == []:
return {'way': way_attribs}
else:
return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}
In [219]:
def get_element(osm_file, tags=('node', 'way')):
"""Yield element if it is the right type of tag"""
context = ET.iterparse(osm_file, events=('start', 'end'))
_, root = next(context)
for event, elem in context:
if event == 'end' and elem.tag in tags:
yield elem
root.clear()
def is_numeric(s):
try:
float(s)
return True
except ValueError:
return False
def keep_numeric(original):
for i,v in original.items():
if is_numeric(v):
if float(v).is_integer():
original[i]=int(float(v))
else:
original[i]=float(v)
return original
def process_map(file_in):
"""Iteratively process each XML element and write to csv(s)"""
with codecs.open(NODES_PATH, 'w') as nodes_file, codecs.open(NODE_TAGS_PATH, 'w') as nodes_tags_file, \
codecs.open(WAYS_PATH, 'w') as ways_file,codecs.open(WAY_NODES_PATH, 'w') as way_nodes_file, \
codecs.open(WAY_TAGS_PATH, 'w') as way_tags_file:
nodes_writer = csv.DictWriter(nodes_file, NODE_FIELDS)
node_tags_writer = csv.DictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
ways_writer = csv.DictWriter(ways_file, WAY_FIELDS)
way_nodes_writer = csv.DictWriter(way_nodes_file, WAY_NODES_FIELDS)
way_tags_writer = csv.DictWriter(way_tags_file, WAY_TAGS_FIELDS)
nodes_writer.writeheader()
node_tags_writer.writeheader()
ways_writer.writeheader()
way_nodes_writer.writeheader()
way_tags_writer.writeheader()
for element in get_element(file_in, tags=('node', 'way')):
el = shape_element(element)
if el:
if element.tag == 'node':
if element.getchildren() == []:
nodes_writer.writerow(keep_numeric(el['node']))
else:
nodes_writer.writerow(keep_numeric(el['node']))
node_tags_writer.writerows([keep_numeric(i) for i in el['node_tags']])
elif element.tag == 'way':
if element.getchildren() == []:
ways_writer.writerow(keep_numeric(el['way']))
else:
ways_writer.writerow(keep_numeric(el['way']))
way_nodes_writer.writerows([keep_numeric(i) for i in el['way_nodes']])
way_tags_writer.writerows([keep_numeric(i) for i in el['way_tags']])
In [220]:
if __name__ == '__main__':
process_map(OSM_PATH)
In [223]:
import sqlite3
import pandas as pd
db = sqlite3.connect('sanfrancisco.db')
c=db.cursor()
query="SELECT tags.value, COUNT(*) as count\
FROM (SELECT * FROM nodes_tags\
UNION ALL\
SELECT * FROM ways_tags) tags\
WHERE tags.key='street'\
GROUP BY tags.value\
ORDER BY count DESC;"
c.execute(query)
rows=pd.DataFrame(c.fetchall(),columns=['Street','count'])
db.close()
In [228]:
rows.head(20)
Out[228]:
In [231]:
rows.tail(20)
Out[231]:
In [238]:
db = sqlite3.connect('sanfrancisco.db')
c=db.cursor()
query="SELECT tags.value, COUNT(*) as count\
FROM (SELECT * FROM nodes_tags\
UNION ALL\
SELECT * FROM ways_tags) tags\
WHERE tags.key='postcode'\
GROUP BY tags.value\
ORDER BY count DESC;"
c.execute(query)
rows=pd.DataFrame(c.fetchall(),columns=['Postcode','count'])
db.close()
In [239]:
rows.head(10)
Out[239]:
In [240]:
rows.tail(10)
Out[240]:
In [241]:
db = sqlite3.connect('sanfrancisco.db')
c=db.cursor()
query="SELECT tags.value, COUNT(*) as count\
FROM (SELECT * FROM nodes_tags\
UNION ALL\
SELECT * FROM ways_tags) tags\
WHERE tags.key='city'\
GROUP BY tags.value\
ORDER BY count DESC;"
c.execute(query)
rows=pd.DataFrame(c.fetchall(),columns=['City','count'])
db.close()
In [242]:
rows
Out[242]:
In [270]:
db = sqlite3.connect('sanfrancisco.db')
c=db.cursor()
query="SELECT nodes_tags.value,COUNT(*) as num\
FROM nodes_tags, (SELECT DISTINCT(id) FROM nodes_tags WHERE value='cafe') AS i\
WHERE nodes_tags.id=i.id AND nodes_tags.key='name'\
GROUP BY nodes_tags.value\
ORDER BY num desc\
LIMIT 1;"
c.execute(query)
result=c.fetchall()
db.close()
In [271]:
result
Out[271]:
I just repeatly use this code to perform different query to database.
In [ ]: